# General packages
library(tidyverse)
library(janitor)
library(plotly)
library(RColorBrewer)
# Packages for cluster analysis:
library(NbClust)
library(cluster)
library(factoextra)
library(dendextend)
library(ggdendro)
# Packages for text mining/sentiment analysis/word cloud
library(pdftools)
library(tidytext)
library(wordcloud)
Never mind… back to the iris dataset?
# Tidy up the built-in iris data: snake_case column names via janitor
iris_nice <- clean_names(iris)

# Petal dimensions, colored by species
ggplot(iris_nice, aes(x = petal_length, y = petal_width, color = species)) +
  geom_point()

# Sepal dimensions, colored by species
ggplot(iris_nice, aes(x = sepal_length, y = sepal_width, color = species)) +
  geom_point()
# How many clusters do you THINK there should be?
number_est <- NbClust(iris_nice[1:4], min.nc = 2, max.nc = 10, method = "kmeans")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 10 proposed 2 as the best number of clusters
## * 8 proposed 3 as the best number of clusters
## * 2 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 1 proposed 8 as the best number of clusters
## * 1 proposed 10 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
# By these estimators, 2 is the best number of clusters...but should that change our mind? Maybe...
# What if we consider similarities across all four variables?
iris_km <- kmeans(iris_nice[1:4], 3) # kmeans specifying 3 groups!
# NOTE: k-means cluster *numbering* is arbitrary (it depends on the random
# starts), so the sizes/centers/assignments below may appear with different
# labels on a re-run even when the partition itself is the same.
iris_km$size
## [1] 62 38 50
iris_km$centers
##   sepal_length sepal_width petal_length petal_width
## 1     5.901613    2.748387     4.393548    1.433871
## 2     6.850000    3.073684     5.742105    2.071053
## 3     5.006000    3.428000     1.462000    0.246000
iris_km$cluster
##   [1] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3
##  [36] 3 3 3 3 3 3 3 3 3 3 3 3 3 3 3 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##  [71] 1 1 1 1 1 1 1 2 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 1 2 2 2
## [106] 2 1 2 2 2 2 2 2 1 1 2 2 2 2 1 2 1 2 1 2 2 1 1 2 2 2 2 2 1 2 2 2 2 1 2
## [141] 2 2 1 2 2 2 1 2 2 1
# Attach each observation's cluster assignment (as a factor) to the data
iris_cl <- iris_nice %>%
  mutate(cluster_no = factor(iris_km$cluster))
# Sepal view, now colored by assigned cluster
ggplot(iris_cl, aes(x = sepal_length, y = sepal_width, color = cluster_no)) +
  geom_point()
A little better…
# Petal view: color = k-means cluster, point shape = true species
# ("shape" is the ggplot2 name for the base-graphics "pch" aesthetic)
ggplot(iris_cl,
       aes(x = petal_length,
           y = petal_width,
           color = cluster_no,
           shape = species)) +
  geom_point() +
  scale_color_brewer(palette = "Set2")
Make it 3D with plot_ly()…
# Or, a 3D plot with plotly.
# Use the data/formula interface consistently (the original mixed bare
# `iris_cl$` vectors with one `~` formula), and set mode = "markers"
# explicitly -- that silences plotly's "No scatter3d mode specifed" warning.
plot_ly(data = iris_cl,
        x = ~petal_length,
        y = ~petal_width,
        z = ~sepal_width,
        type = "scatter3d",
        mode = "markers",
        color = ~cluster_no,
        symbol = ~species,
        marker = list(size = 3),
        colors = "Set1")
Hierarchical cluster analysis (dendrograms) in R
Relevant functions:
stats::hclust() - agglomerative hierarchical clustering; cluster::diana() - divisive hierarchical clustering
We’ll be using WorldBank environmental data (simplified), wb_env.csv
# Get the data (expects wb_env.csv in the working directory / project root;
# column spec shown in the parse output below)
wb_env <- read_csv("wb_env.csv")
## Parsed with column specification:
## cols(
## name = col_character(),
## region = col_character(),
## electricity = col_double(),
## agland = col_double(),
## co2 = col_double(),
## methane = col_double(),
## ghg = col_double()
## )
# Only keep top 20 greenhouse gas emitters (for simplifying visualization here...)
wb_ghg_20 <- wb_env %>%
  arrange(desc(ghg)) %>%  # desc() is the idiomatic dplyr descending sort
  slice_head(n = 20)

# Scale the numeric variables (mean 0, sd 1) so no single variable dominates
# the Euclidean distances; select columns by name rather than position 3:7
wb_scaled <- as.data.frame(scale(select(wb_ghg_20, electricity:ghg)))

# Row names become the dendrogram leaf labels (country name)
rownames(wb_scaled) <- wb_ghg_20$name

# Compute dissimilarity values (pairwise Euclidean distances)
diss <- dist(wb_scaled, method = "euclidean")

# Agglomerative hierarchical clustering, complete linkage
hc_complete <- hclust(diss, method = "complete")

# Base-graphics dendrogram; hang = -1 aligns all labels at the same level
plot(hc_complete, cex = 0.6, hang = -1)
Divisive clustering:
# Divisive hierarchical clustering on the same dissimilarity matrix
hc_div <- diana(diss)

# plot.diana() does not accept `hang` -- it emits repeated
# '"hang" is not a graphical parameter' warnings -- and rect.hclust()
# expects an hclust object. Converting once with as.hclust() fixes both.
hc_div_hc <- as.hclust(hc_div)
plot(hc_div_hc, cex = 0.6, hang = -1)
rect.hclust(hc_div_hc, k = 4, border = 2:5)
We might want to compare those…because they differ slightly.
# Coerce both trees to class 'dendrogram' so dendextend can work with them
dend1 <- as.dendrogram(hc_complete)
dend2 <- as.dendrogram(hc_div)

# Bundle the pair in a dendlist (useful for dendextend comparison helpers
# such as cor.dendlist / entanglement)
dend_list <- dendlist(dend1, dend2)

# Side-by-side tanglegram comparing the two clusterings
tanglegram(dend1, dend2)
# Convert the hclust result to 'dendro' data for manual ggplotting later
data1 <- dendro_data(hc_complete)

# Quick ggplot dendrogram via ggdendro's convenience wrapper
ggdendrogram(hc_complete, rotate = TRUE) +
  theme_minimal() +
  labs(x = "Country")
# Want to do it actually in ggplot? Here:
# Leaf labels belong where a segment is vertical (x == xend) at an integer
# x position; pair those terminal segments with the label table so the
# country names can be drawn at the branch tips.
# NOTE(review): this assumes the filtered segments and label(data1) rows
# line up one-to-one and in the same order -- verify if the tree changes.
label_data <- bind_cols(filter(segment(data1), x == xend & x%%1 == 0), label(data1))
# Draw the tree as raw segments + text, flipped so it reads horizontally
# with the root on the left (scale_y_reverse), then strip all axes/grid.
ggplot() +
geom_segment(data=segment(data1), aes(x=x, y=y, xend=xend, yend=yend)) +
# hjust = 0 left-aligns each country name at its branch tip
geom_text(data=label_data, aes(x=xend, y=yend, label=label, hjust=0), size=2) +
coord_flip() +
scale_y_reverse(expand=c(0.2, 0)) +
theme_bw() +
theme(panel.border = element_blank(),
panel.grid.major = element_blank(),
panel.grid.minor = element_blank(),
axis.line = element_blank(),
axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "None")
Note: for a more complete text analysis introduction, I recommend forking and working through Casey O’Hara and Jessica Couture’s eco-data-sci workshop (available here https://github.com/oharac/text_workshop)
We’ll use pdftools to extract text from PDFs, then do some analysis
# Path to the PDF (expects greta_thunberg.pdf alongside this script)
greta_thunberg <- file.path("greta_thunberg.pdf")
# pdf_text() returns a character vector: one string per PDF page
thunberg_text <- pdf_text(greta_thunberg)
# Just call thunberg_text in the console to see the full text
From Casey and Jessica’s workshop:
“pdf_text() returns a vector of strings, one for each page of the pdf. So we can mess with it in tidyverse style, let’s turn it into a dataframe, and keep track of the pages.
Then we can use stringr::str_split() to break the pages up into individual lines. Each line of the pdf is concluded with a backslash-n, so split on this. We will also add a line number in addition to the page number."
First, make it a data frame and do some wrangling.
# One row per page. stringsAsFactors = FALSE keeps `text` as character so
# str_split() works on R < 4.0, where data.frame() defaulted to factors
# (harmless no-op on R >= 4.0).
thunberg_df <- data.frame(text = thunberg_text, stringsAsFactors = FALSE) %>%
  mutate(text_full = str_split(text, '\\n')) %>%  # split each page into lines
  unnest(text_full)                               # one row per line

speech_text <- thunberg_df %>% # Get the full speech
  select(text_full) %>%        # Only keep the text
  slice(4:18)                  # Keep only the speech body rows
Now we just have the text in an easy-to-use format.
We can use tidytext::unnest_tokens to separate all the words
# Tokenize into one row per word (unnest_tokens also lowercases and
# strips punctuation by default)
sep_words <- speech_text %>%
  unnest_tokens(output = word, input = text_full)
Then count how many times they each show up…
# Word frequencies, most common first (sort = TRUE arranges descending by n)
word_counts <- sep_words %>%
count(word, sort = TRUE)
…but a lot of those words aren’t really things we’re interested in counting… …luckily, there’s a thing for that.
“Stop words” are common words that aren’t generally relevant for searching or analyzing things. We can have R remove those.
# Remove the stop words; an explicit `by = "word"` makes the join key
# obvious and silences dplyr's 'Joining, by = "word"' message
words_stop <- sep_words %>%
  anti_join(stop_words, by = "word")

# And we can count them
word_count <- words_stop %>%
  count(word, sort = TRUE) # Count words and arrange
Some intro sentiment analysis.
First, check out the ‘sentiments’ lexicon. From tidytext (https://www.tidytextmining.com/sentiment.html):
“The three general-purpose lexicons are AFINN, bing, and nrc.
All three of these lexicons are based on unigrams, i.e., single words. These lexicons contain many English words and the words are assigned scores for positive/negative sentiment, and also possibly emotions like joy, anger, sadness, and so forth. The nrc lexicon categorizes words in a binary fashion (“yes”/“no”) into categories of positive, negative, anger, anticipation, disgust, fear, joy, sadness, surprise, and trust. The bing lexicon categorizes words in a binary fashion into positive and negative categories. The AFINN lexicon assigns words with a score that runs between -5 and 5, with negative scores indicating negative sentiment and positive scores indicating positive sentiment. All of this information is tabulated in the sentiments dataset, and tidytext provides a function get_sentiments() to get specific sentiment lexicons without the columns that are not used in that lexicon."
# NOTE(review): later tidytext releases renamed the AFINN column from
# `score` to `value`; the code below (filter on `score`) matches the
# older lexicon version shown in this output -- confirm before re-running.
get_sentiments("afinn")
## # A tibble: 2,476 x 2
##    word       score
##    <chr>      <int>
##  1 abandon       -2
##  2 abandoned     -2
##  3 abandons      -2
##  4 abducted      -2
##  5 abduction     -2
##  6 abductions    -2
##  7 abhor         -3
##  8 abhorred      -3
##  9 abhorrent     -3
## 10 abhors        -3
## # … with 2,466 more rows
# Examples of really awesome words (AFINN scores of 4 or 5):
pos_words <- get_sentiments("afinn") %>%
  filter(score %in% c(4, 5)) %>%  # %in% reads better than score == 5 | score == 4
  head(20)

# You can look up negative words on your own, (but yes, it includes the worst words you can think of)
# Roughly neutral words (scores between -1 and 1, inclusive):
neutral_words <- get_sentiments("afinn") %>%
  filter(between(score, -1, 1)) %>%
  head(20)
# Explore the other sentiment lexicons:
# (nrc can map one word to several categories -- see "abandon" below,
# which is tagged fear, negative, AND sadness)
get_sentiments("nrc") # Assigns words to sentiment "groups"
## # A tibble: 13,901 x 2
## word sentiment
## <chr> <chr>
## 1 abacus trust
## 2 abandon fear
## 3 abandon negative
## 4 abandon sadness
## 5 abandoned anger
## 6 abandoned fear
## 7 abandoned negative
## 8 abandoned sadness
## 9 abandonment anger
## 10 abandonment fear
## # … with 13,891 more rows
get_sentiments("bing") # Binary; either "positive" or "negative"
## # A tibble: 6,788 x 2
## word sentiment
## <chr> <chr>
## 1 2-faced negative
## 2 2-faces negative
## 3 a+ positive
## 4 abnormal negative
## 5 abolish negative
## 6 abominable negative
## 7 abominably negative
## 8 abominate negative
## 9 abomination negative
## 10 abort negative
## # … with 6,778 more rows
So how do the non-stop-words in Greta’s speech get ranked?
# Recall what those words looked like:
# (a single `word` column -- stop words already removed via anti_join)
words_stop
## word
## 1 greta
## 2 thunberg
## 3 15
## 4 sweden
## 5 speak
## 6 behalf
## 7 climate
## 8 justice
## 9 people
## 10 sweden
## 11 country
## 12 doesn’t
## 13 matter
## 14 i’ve
## 15 learned
## 16 difference
## 17 children
## 18 headlines
## 19 world
## 20 school
## 21 imagine
## 22 speak
## 23 matter
## 24 uncomfortable
## 25 speak
## 26 green
## 27 eternal
## 28 economic
## 29 growth
## 30 scared
## 31 unpopular
## 32 talk
## 33 moving
## 34 forward
## 35 bad
## 36 ideas
## 37 mess
## 38 pull
## 39 emergency
## 40 brake
## 41 mature
## 42 burden
## 43 leave
## 44 children
## 45 don’t
## 46 care
## 47 popular
## 48 care
## 49 climate
## 50 justice
## 51 living
## 52 planet
## 53 civilization
## 54 sacrificed
## 55 opportunity
## 56 people
## 57 continue
## 58 enormous
## 59 amounts
## 60 money
## 61 biosphere
## 62 sacrificed
## 63 rich
## 64 people
## 65 countries
## 66 mine
## 67 live
## 68 luxury
## 69 sufferings
## 70 pay
## 71 luxuries
## 72 2078
## 73 celebrate
## 74 75th
## 75 birthday
## 76 children
## 77 spend
# Let's see how they're aligned with each of the different lexicons
# Note: the context is lost. Let's see how that manifests here.
# inner_join keeps only the speech words present in each lexicon;
# explicit `by = "word"` silences the 'Joining, by' messages.
sent_afinn <- words_stop %>%
  inner_join(get_sentiments("afinn"), by = "word")

sent_nrc <- words_stop %>%
  inner_join(get_sentiments("nrc"), by = "word")

sent_bing <- words_stop %>%
  inner_join(get_sentiments("bing"), by = "word")
Then you can imagine there are different ways to quantify those outcomes…
Here are just a few examples:
# What are the most common sentiment groups (by NRC)?
# count() is shorthand for group_by() + tally()
nrc_count <- sent_nrc %>%
  count(sentiment)
nrc_count
## # A tibble: 10 x 2
## sentiment n
## <chr> <int>
## 1 anger 2
## 2 anticipation 5
## 3 disgust 3
## 4 fear 2
## 5 joy 5
## 6 negative 6
## 7 positive 14
## 8 sadness 4
## 9 surprise 4
## 10 trust 8
# Orrr we can just count up the positive and negative outcomes from bing:
bing_count <- sent_bing %>%
  count(sentiment)
bing_count
## # A tibble: 2 x 2
## sentiment n
## <chr> <int>
## 1 negative 9
## 2 positive 5
# Or we can find a mean or median score based on the afinn lexicon:
# na.rm = TRUE guards against any unscored/NA entries (no-op when none)
afinn_summary <- sent_afinn %>%
  summarize(
    mean = mean(score, na.rm = TRUE),
    sd = sd(score, na.rm = TRUE),
    median = median(score, na.rm = TRUE)
  )
Make a word cloud:
# Word cloud of the speech's non-stop-words, sized by frequency
wordcloud(words = word_count$word,
          freq = word_count$n,
          min.freq = 1,   # include even single-occurrence words
          max.words = 65, # cap the cloud size
          scale = c(2, 0.1),
          colors = brewer.pal(3, "Dark2"))